# Load the two Basketball-Reference season exports and merge them with the
# previously built frame `df`, keeping a single row per player-season.
df25 = pd.read_excel("datasets\\Basketball_ref_2425.xlsx")
df24 = pd.read_excel("datasets\\Basketball_ref_2324.xlsx")

# Stack the three seasons into one frame with a fresh integer index.
concat_df = pd.concat([df, df24, df25], ignore_index=True)

# A player-season can appear in more than one source; keep the first
# occurrence (priority: df, then df24, then df25).
final_df = concat_df.drop_duplicates(
    subset=["PLAYER_NAME", "SEASON_ID"],
    keep="first",
)

# Quick sanity checks on the merged result.
print(final_df.sample(7))
print(final_df[final_df["PLAYER_NAME"] == "Victor Wembanyama"])
print(final_df.info())
# Top 10 per quantitative category.
def _print_top10(stats, column, title):
    """Print the ten rows of *stats* with the highest *column* and return them.

    Parameters
    ----------
    stats : pandas.DataFrame
        Per-player season averages; must contain PLAYER_NAME, SEASON_ID
        and *column*.
    column : str
        Stat column to rank by, descending.
    title : str
        Heading printed above the table.

    Returns
    -------
    pandas.DataFrame
        The full top-10 slice of *stats* (all columns), for reuse downstream.
    """
    top = stats.sort_values(by=column, ascending=False).head(10)
    print(title)
    print(top[["PLAYER_NAME", "SEASON_ID", column]], "\n")
    return top


top_pts = _print_top10(df_stats, "PTS", "Top 10 Scorers (Avg Points per Game):")
top_reb = _print_top10(df_stats, "REB", "Top 10 Rebounders (Avg Rebounds per Game):")
top_ast = _print_top10(df_stats, "AST", "Top 10 Passers (Avg Assists per Game):")
top_3PM = _print_top10(df_stats, "FG3M", "Top 10 3 pointers (Avg 3 points made per Game):")
# Creating train and test splits using the train_test_split function.
def _feature_matrix(frame, keep_cols):
    """Return a copy of *frame* restricted to the columns listed in *keep_cols*.

    Column order of *frame* is preserved, and entries of *keep_cols* that are
    absent from *frame* are silently ignored (same semantics as dropping the
    complement, which the original duplicated inline twice).
    """
    return frame.drop(columns=[c for c in frame.columns if c not in keep_cols])


# Tree/ensemble target: change in fantasy production (delta_fantasy).
model_df = _feature_matrix(df_train, model_stats)
model_target = df_train["delta_fantasy"]
model_df_train, model_df_test, model_target_train, model_target_test = train_test_split(
    model_df, model_target, test_size=0.2, random_state=42
)

# Linear-model target: next season's 9-cat fantasy z-score.
linear_model_df = _feature_matrix(df_linear_train, model_stats)
linear_model_target = df_linear_train["next_fantasy_z_9cat"]
(
    linear_model_df_train,
    linear_model_df_test,
    linear_model_target_train,
    linear_model_target_test,
) = train_test_split(
    linear_model_df, linear_model_target, test_size=0.2, random_state=42
)
# Ridge regression on top of the shared preprocessing pipeline; the
# regularization strength alpha is tuned by 5-fold cross-validated grid
# search maximizing R^2.
pipe_R = Pipeline([
    ("preprocessor", preprocessor),
    ("Ridge", Ridge()),
])

# Alpha values to test.
alpha_values = [0.01, 0.1, 1.0, 10.0, 100.0, 150.0]

grid = GridSearchCV(
    estimator=pipe_R,
    param_grid={"Ridge__alpha": alpha_values},
    cv=5,
    scoring="r2",
    return_train_score=False,
)

# Per-sample weights are routed to the final estimator's fit() through the
# '<step_name>__sample_weight' key, then splatted into GridSearchCV.fit.
fit_params = {"Ridge__sample_weight": sample_weights}
grid.fit(linear_X_train, y_train, **fit_params)

# Evaluate the tuned model on the held-out test split.
y_pred = grid.predict(linear_X_test)
test_r2_score = r2_score(y_test, y_pred)

print("-" * 30)
print(f"Optimal Alpha found via CV: {grid.best_params_['Ridge__alpha']}")
print(f"Cross-Validation (Training) R2 Score: {grid.best_score_:.4f}")
print(f"Final **Unseen Test Set** R2 Score: {test_r2_score:.4f}")
print("-" * 30)